This analysis is to explore the best telomere mixed model to fit the CML colony data
knitr::opts_chunk$set(collapse = TRUE)
library(tidyverse)
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(ggbeeswarm)
library(ggtext)
library(ggeffects)
library(ggnewscale)
library(ggh4x)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(patchwork)
library(lme4)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(optimx)
library(dfoptim)
library(lmeresampler)
library(sjPlot)
## Learn more about sjPlot with 'browseVignettes("sjPlot")'.
library(performance)
library(parameters)
import::from(.from = "dplyr", count, select, rename, slice)
# Add totals to df
addTotal <-
function(df) {
df %>% mutate(across(where(is.logical), as.character)) %>%
bind_rows(summarise(.,
across(where(is.numeric), sum, na.rm = T),
across(where(is.character), ~ "Total")))
}
# Lmer optimiser QC
# allfit summary
all_fit.summary <- function(all_fit) {
broom.mixed::glance(all_fit) %>% select(optimizer, AIC, BIC, NLL_rel) %>% arrange(NLL_rel)
}
# allfit compare estimated ranked by fit metrics
all_fit.compare <- function(all_fit) {
Reduce(left_join, list(
broom.mixed::glance(all_fit),
Reduce(bind_rows, list(broom.mixed::tidy(all_fit) %>% filter(effect == "fixed"),
lapply(all_fit, function(x){ x@optinfo$conv$opt}) %>% unlist() %>% enframe(name = "optimizer", value = "estimate") %>% mutate(term = "lme4.optimizer_code"),
lapply(all_fit, function(x){ as.numeric(isSingular(x))}) %>% unlist() %>% enframe(name = "optimizer", value = "estimate") %>% mutate(term = "isSingular"),
lapply(all_fit, function(x){ as.numeric(check_convergence(x))}) %>% unlist() %>% enframe(name = "optimizer", value = "estimate") %>% mutate(term = "Converged")
)))) %>%
ggplot(aes(reorder(optimizer, -NLL_rel), estimate, colour = optimizer)) +
geom_point() +
facet_grid(0 ~ term, scale = "free_x") +
coord_flip() + guides(colour = "none") +
theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1))
}
# Model check
model.check <- function(model) {
n <- rlang::expr_name(rlang::enexpr(model))
res <- enframe(c(
"Formula" = unlist(str_split(as.character(model@call), pattern = "\\="))[2],
"Observations" = nrow(model@frame),
"REML" = isREML(model),
"Optimizer" = model@optinfo$optimizer,
"Singular" = isSingular(model),
"Converged" = check_convergence(model)
)) %>% tab_df(title = paste0("Model ", n),
col.header = c("Test", "Value"))
return(res)
}
# get model formula
get_formula <- function(model) {
str_sub(unlist(str_split(as.character(model@call), pattern = "\\="))[2])
}
per_sample_statistics.n834.df <-
read_csv("../data/per_sample_statistics.n834.csv")
## New names:
## Rows: 834 Columns: 32
## ── Column specification
## ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── Delimiter: "," chr
## (7): Patient, Sample, tip.label, driver3, platform_unit, run_id.uniq, status dbl (25): ...1, age_at_sample_exact, meanvaf, rho.bb, meandepth, sensitivity, F1, F2, F4, Psi,
## Insert_mean, Insert_sd, Read_length, Initial_read_length, F2a, Length, Se...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
driver.palette <- c(`Wild-type` = "#0072B5FF", Other ="#E18727FF", `*BCR::ABL1*`="#BC3C29FF")
driver.palette1 <- c(WT = "#0072B5FF", Other ="#E18727FF", BCR_ABL="#BC3C29FF")
driver.palette2 <- c(Wt = "#0072B5FF", Mt="#BC3C29FF")
driver.palette2b <- c(Absence = "#717D6E", Presence="#BC3C29FF")
# BCR::ABL1-mutated vs non mutated
per_sample_statistics.n834.df %<>%
mutate(BCR_ABL1 = ifelse(grepl("BCR::ABL1", driver3, perl = T), "Mt", "Wt"))
# set Wt as ref
per_sample_statistics.n834.df %<>% mutate(BCR_ABL1 = factor(BCR_ABL1, levels = c("Wt", "Mt")))
# BCR::ABL1-mutated vs Other driver vs wildtype
per_sample_statistics.n834.df %<>% mutate(driver2 = ifelse(driver3 == "WT", "WT", ifelse(
grepl("BCR::ABL1", driver3, perl = T),
"BCR_ABL",
"Other"
)))
per_sample_statistics.n834.df %<>% mutate(driver2 = factor(driver2, levels = c("WT", "Other", "BCR_ABL")))
# Set batch variable as character
per_sample_statistics.n834.df %<>% mutate(library.cluster = as.character(library.cluster))
In total 834 samples passed QC across 9 patients. Age (age_at_sample_exact) was defined as the count of completed years from birth at sampling and sample mutant status (BCR_ABL1) was defined as BCR::ABL1 positive (Mt; n=365) or negative (Wt; n=469).
per_sample_statistics.n834.df %>%
count(Patient, BCR_ABL1) %>%
pivot_wider(names_from = BCR_ABL1, values_from = n) %>%
addTotal() %>%
tab_df()
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(where(is.numeric), sum, na.rm = T)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
| Patient | Wt | Mt |
|---|---|---|
| PD51632 | 24 | 59 |
| PD51633 | 77 | 4 |
| PD51634 | 48 | 75 |
| PD51635 | 85 | 32 |
| PD56961 | 105 | 58 |
| PD57332 | NA | 35 |
| PD57333 | 75 | NA |
| PD57334 | 44 | 11 |
| PD57335 | 11 | 91 |
| Total | 469 | 365 |
attr(per_sample_statistics.n834.df$Length, "label") <- "Mean telomere length (bp)"
attr(per_sample_statistics.n834.df$nsub_adj, "label") <- "SNV burden (depth adjusted)"
attr(per_sample_statistics.n834.df$age_at_sample_exact, "label") <- "Age (years)"
attr(per_sample_statistics.n834.df$BCR_ABL1, "label") <- "BCR::ABL1 status"
attr(per_sample_statistics.n834.df$driver2, "label") <- "Driver status"
Mitchell et al (2022) showed that telomerecat telomere length estimates from NovaSeq sequenced samples are problematic with very high estimates and zero length estimates compared to HiSeqX sequenced samples.
We hypothesised that a likely contributing cause is the change from 4 colour (HiSeqX) to 2 colour (NovaSeq) sequencing. Therefore NovaSeq sequencing outputs a “G” when there is a “G” base and when there is an error, thus causing misattribution of telomeric reads by telomerecat.
Reads containing telomeric repeats (from telomerecat telbam) show a marked increase in “G” content after 75bp.
Running Telomerecat with -t 75 “trims” read attribution to the first 75bp resulting in a decrease in outlier estimates.
Plotting the trimmed telomerecat mean telomere lengths (bp) vs. age at sampling cohort (n=834) shows:
The Wild-type trend is similar to the prior estimate from Mitchell et al.
In most patients the BCR::ABL1 mutated samples have shorter telomere lengths
There seems to be no overt difference between wild-type and Other driver samples
# Setup facet xlim
x_lim.df <-
per_sample_statistics.n834.df %>% group_by(Patient) %>% summarise(min = min(age_at_sample_exact),
max = max(age_at_sample_exact))
x_scale <- lapply(1:nrow(x_lim.df), function(x) {
y = round(x_lim.df$min[x] + ((x_lim.df$max[x] - x_lim.df$min[x]) / 2))
scale_x_continuous(limits = c(y - 7, y + 7), breaks = pretty_breaks())
})
names(x_scale) <- x_lim.df$Patient
per_sample_statistics.n834.df %>%
mutate(
Patient = factor(
Patient,
levels = c(
"PD57334",
"PD56961",
"PD57335",
"PD51634",
"PD51633",
"PD51632",
"PD51635",
"PD57333",
"PD57332"
)
),
`*BCR::ABL1* status` = factor(
BCR_ABL1,
levels = c("Wt", "Mt"),
labels = c("Absence", "Presence")
),
`Driver status` = factor(
driver2,
levels = c("BCR_ABL", "WT", "Other"),
labels = c("*BCR::ABL1*", "Wild-type", "Other")
),
`Mean telomere length (bp)` = Length,
`Age (years)` = age_at_sample_exact,
group2 = paste0(Patient, "_", age_at_sample_exact, "_", `Driver status`)
) %>%
ggplot(
aes(
`Age (years)`,
`Mean telomere length (bp)`,
colour = `Driver status`,
group.by = group2
)
) +
geom_quasirandom(width = 1, alpha = 0.6) +
scale_colour_manual(values = driver.palette) +
theme_bw(base_size = 16) +
theme(legend.title = element_markdown(), legend.text = element_markdown()) +
facet_grid( ~ Patient, scales = "free_x") +
guides(color = guide_legend(override.aes = list(linetype = c(rep(0, length(driver.palette)))))) +
new_scale_colour() +
scale_color_grey() +
geom_abline(
data = as_tibble(list(
a = 4512.38235,
b = -30.80877,
Reference = "Mitchell et al. -30bp/year"
)),
aes(
intercept = a,
slope = b,
colour = Reference
),
lty = 2,
show.legend = TRUE
) +
facetted_pos_scales(x = x_scale[c(
"PD57334",
"PD56961",
"PD57335",
"PD51634",
"PD51633",
"PD51632",
"PD51635",
"PD57333",
"PD57332"
)])
Unlike the SNV burden analysis where we have a well formed correction for sequencing depth given sensitivity of germline SNP recall and sample VAF, telomerecat telomere length estimation is likely to incur significant batch effects.
Telomerecat estimates the mean telomere length using counts of telomeric reads (F1) and telomere boundary reads (F2_a). This makes it robust to ploidy with the assumption that F1 and F2_a reads are recalled at the same rate.
We hypothesised that this assumption could be affected by:
Lower read depth - causing telomere boundary reads (F2_a) to be stochastically undersampled thereby giving an overestimate of telomere length
Sequencing library preparation - causing batch to batch variation in the ratio of F1/F2a reads
We could assume that in an diploid unbiased sample the number of telomere boundary reads (F2_a) divided by 46 should equal the sample sequencing coverage (Seq.X).
In fact we see divergences from this assumption in 3-4 of our Patients with differences across the BCR::ABL1 fusion status.
per_sample_statistics.n834.df %>%
mutate(`*BCR::ABL1* status` = factor(
BCR_ABL1,
levels = c("Wt", "Mt"),
labels = c("Absence", "Presence")
)) %>%
ggplot(aes(Seq.X, F2a / 46, color = `*BCR::ABL1* status`)) +
geom_point(alpha =0.6) +
scale_colour_manual(values = driver.palette2b) +
theme_bw(base_size = 16) +
theme(
legend.title = element_markdown(),
legend.text = element_markdown(),
legend.position = "bottom"
) +
geom_abline(intercept = 0, slope = 1) +
facet_grid(~ Patient)
The ratio of telomere boundary reads (F2a) to the sum of all boundary reads with telomeric repeats (F2 + F4) is lower in these samples, where we would expect the ratio to be similar at the patient-level.
per_sample_statistics.n834.df %>%
ggplot(aes(Seq.X, F2a / 46, color = F2a / (F2 + F4))) +
geom_point() +
scale_color_viridis_c(option = "A",
trans = scales::pseudo_log_trans(sigma = 0.001)) +
theme_bw(base_size = 16) +
theme(
legend.position = "bottom"
) + geom_abline(intercept = 0, slope = 1) +
facet_grid( ~
Patient)
We defined two candidate batch variables:
The library preparation cluster (library.cluster), that is the library preparation date clustered by adjacent days (n=22)
The unique sequence run ID (run_id.uniq) which is the concatenated flowcell IDs of for each sample (n=31)
With “run_id.uniq” a more granular variable which accounts for both library preparation cluster and sequencing batch.
library.cluster.panel <-
per_sample_statistics.n834.df %>%
mutate(F2a_ratio = F2a / (F2 + F4)) %>%
filter(Patient %in% c("PD51632", "PD56961")) %>%
mutate(`*BCR::ABL1* status` = factor(
BCR_ABL1,
levels = c("Wt", "Mt"),
labels = c("Absence", "Presence")
)) %>%
select(
Patient,
`*BCR::ABL1* status`,
age_at_sample_exact,
library.cluster,
Length,
F2a_ratio,
Seq.X
) %>% pivot_longer(cols = Length:Seq.X) %>%
mutate(name = factor(name, levels = c("Seq.X", "F2a_ratio", "Length"))) %>%
ggplot(aes(value, library.cluster, colour = `*BCR::ABL1* status`)) +
geom_quasirandom(width = 0.2 , alpha = 0.6) +
scale_colour_manual(values = driver.palette2b) +
theme_bw(base_size = 12) +
theme(
legend.title = element_markdown(),
legend.text = element_markdown(),
legend.position = "bottom"
) +
facet_grid(Patient + age_at_sample_exact ~ name, scales = "free")
run_id.uniq.panel <-
per_sample_statistics.n834.df %>%
mutate(F2a_ratio = F2a / (F2 + F4)) %>%
filter(Patient %in% c("PD51632", "PD56961")) %>%
mutate(`*BCR::ABL1* status` = factor(
BCR_ABL1,
levels = c("Wt", "Mt"),
labels = c("Absence", "Presence")
)) %>%
select(
Patient,
`*BCR::ABL1* status`,
age_at_sample_exact,
run_id.uniq,
Length,
F2a_ratio,
Seq.X
) %>% pivot_longer(cols = Length:Seq.X) %>%
mutate(name = factor(name, levels = c("Seq.X", "F2a_ratio", "Length"))) %>%
ggplot(aes(value, run_id.uniq, colour = `*BCR::ABL1* status`)) +
geom_quasirandom(width = 0.2 , alpha = 0.6) +
scale_colour_manual(values = driver.palette2b) +
theme_bw(base_size = 12) +
theme(
legend.title = element_markdown(),
legend.text = element_markdown(),
legend.position = "bottom"
) +
facet_grid(Patient + age_at_sample_exact ~ name, scales = "free")
(library.cluster.panel / run_id.uniq.panel) + plot_layout(guides = 'collect') &
theme(legend.position='bottom')
## Orientation inferred to be along y-axis; override with `position_quasirandom(orientation = 'x')`
## Orientation inferred to be along y-axis; override with `position_quasirandom(orientation = 'x')`
Linear mixed models implemented in the R package “lme4” were used due to the repeated measures at the patient-level in the data set. Models were fitted with default “lme4” parameters, if a model did not converge, lme4::allFit() was used to refit the model to all available optimisers (provide by the “lme4”, “optimx”, and “dfoptim” R packages), the best optimiser was selected from non-singular and converged refits with the highest negative log-likelihood. Only non-singular and converged models were considered for model selection using the Bayesian information criterion (BIC).
To identify an effective batch effect variable we restrict the data to BCR::ABL-ve samples (n=469) and compare each candidate batch effect variable to a baseline model with “age at sample” as a fixed effect and “patient” as a random effect.
tel_wt.lmer_0 <-
lmer(
Length ~ age_at_sample_exact + (1 | Patient),
data = per_sample_statistics.n834.df%>% filter(BCR_ABL1 == "Wt"), REML = F
)
model.check(tel_wt.lmer_0)
| Test | Value |
|---|---|
| Formula | Length ~ age_at_sample_exact + (1 | Patient) |
| Observations | 469 |
| REML | FALSE |
| Optimizer | nloptwrap |
| Singular | FALSE |
| Converged | TRUE |
tel_wt.lmer_0b <-
lmer(
Length ~ age_at_sample_exact + (1 | Patient) + (1 | library.cluster),
data = per_sample_statistics.n834.df %>% filter(BCR_ABL1 == "Wt"), REML = F
)
model.check(tel_wt.lmer_0b)
| Test | Value |
|---|---|
| Formula | Length ~ age_at_sample_exact + (1 | Patient) + (1 | library.cluster) |
| Observations | 469 |
| REML | FALSE |
| Optimizer | nloptwrap |
| Singular | FALSE |
| Converged | TRUE |
tel_wt.lmer_0c <-
lmer(
Length ~ age_at_sample_exact + (1 | Patient) + (1 | run_id.uniq),
data = per_sample_statistics.n834.df %>% filter(BCR_ABL1 == "Wt"), REML = F
)
model.check(tel_wt.lmer_0c)
| Test | Value |
|---|---|
| Formula | Length ~ age_at_sample_exact + (1 | Patient) + (1 | run_id.uniq) |
| Observations | 469 |
| REML | FALSE |
| Optimizer | nloptwrap |
| Singular | FALSE |
| Converged | TRUE |
Across all models the mean telomere length attrition rates estimates (per year) are compatible with Mitchell et al. 2022, showing that NovaSeq derived telomerecat mean telomere length can be used.
Both candidate batch variables improve the baseline model’s fit to the data, with ” + (1 | run_id.uniq)” (tel.lmer_0c; BIC=7483.33) identified as the better variable over ” + (1 | library.cluster)” (tel.lmer_0b; BIC=7507.51).
tab_model(
tel_wt.lmer_0,
tel_wt.lmer_0b,
tel_wt.lmer_0c,
show.se = T,
show.stat = T,
show.ci = F,
show.p = F,
p.style = "scientific",
dv.labels = c(
"Baseline (tel_wt.lmer_0)",
"library.cluster random effect (tel_wt.lmer_0b)",
"run_id.uniq random effect (tel_wt.lmer_0c)"
)
)
| Baseline (tel_wt.lmer_0) | library.cluster random effect (tel_wt.lmer_0b) | run_id.uniq random effect (tel_wt.lmer_0c) | |||||||
|---|---|---|---|---|---|---|---|---|---|
| Predictors | Estimates | std. Error | Statistic | Estimates | std. Error | Statistic | Estimates | std. Error | Statistic |
| (Intercept) | 6405.08 | 402.65 | 15.91 | 6630.98 | 429.51 | 15.44 | 6473.68 | 523.02 | 12.38 |
| Age(years) | -37.43 | 7.39 | -5.07 | -41.48 | 7.95 | -5.21 | -36.60 | 9.66 | -3.79 |
| Random Effects | |||||||||
| σ2 | 485708.47 | 460160.79 | 422854.55 | ||||||
| τ00 | 85546.75 Patient | 74844.06 library.cluster | 211454.29 run_id.uniq | ||||||
| 35940.64 Patient | 17558.01 Patient | ||||||||
| ICC | 0.15 | 0.19 | 0.35 | ||||||
| N | 8 Patient | 8 Patient | 8 Patient | ||||||
| 19 library.cluster | 21 run_id.uniq | ||||||||
| Observations | 469 | 469 | 469 | ||||||
| Marginal R2 / Conditional R2 | 0.300 / 0.405 | 0.345 / 0.472 | 0.264 / 0.523 | ||||||
broom::tidy(anova(tel_wt.lmer_0, tel_wt.lmer_0b)) %>%
mutate(p.value = format.pval(p.value, digits = 3, eps = 1e-50)) %>% tab_df()
| term | npar | AIC | BIC | logLik | deviance | statistic | df | p.value |
|---|---|---|---|---|---|---|---|---|
| tel_wt.lmer_0 | 4 | 7497.97 | 7514.57 | -3744.99 | 7489.97 | NA | NA | NA |
| tel_wt.lmer_0b | 5 | 7486.75 | 7507.51 | -3738.38 | 7476.75 | 13.22 | 1 | 0.000277 |
broom::tidy(anova(tel_wt.lmer_0, tel_wt.lmer_0c)) %>%
mutate(p.value = format.pval(p.value, digits = 3, eps = 1e-50)) %>% tab_df()
| term | npar | AIC | BIC | logLik | deviance | statistic | df | p.value |
|---|---|---|---|---|---|---|---|---|
| tel_wt.lmer_0 | 4 | 7497.97 | 7514.57 | -3744.99 | 7489.97 | NA | NA | NA |
| tel_wt.lmer_0c | 5 | 7462.58 | 7483.33 | -3726.29 | 7452.58 | 37.39 | 1 | 9.66e-10 |
With a batch variable now selected we now look to model the effect of BCR::ABL1 fusion status on mean telomere length, we will build the model (accounting for batch) in a stepwise fashion:
Confirm a BCR::ABL fusion status effect as a fixed effect only
Test the addition of explanatory variables to the random effect configuration
Using the full dataset (n=834), we compare the inclusion of BCR::ABL1 fusion status as a fixed effect to a null model “Length ~ age_at_sample_exact + (1 | Patient) + (1| run_id.uniq)”.
tel.lmer2_0 <-
lmer(
Length ~ age_at_sample_exact + (1 | Patient) + (1| run_id.uniq),
data = per_sample_statistics.n834.df , REML = F
)
model.check(tel.lmer2_0)
| Test | Value |
|---|---|
| Formula | Length ~ age_at_sample_exact + (1 | Patient) + (1 | run_id.uniq) |
| Observations | 834 |
| REML | FALSE |
| Optimizer | nloptwrap |
| Singular | FALSE |
| Converged | TRUE |
tel.lmer2_1 <-
lmer(
Length ~ age_at_sample_exact + BCR_ABL1 + (1 | Patient) + (1| run_id.uniq),
data = per_sample_statistics.n834.df, REML = F
)
model.check(tel.lmer2_1)
| Test | Value |
|---|---|
| Formula | Length ~ age_at_sample_exact + BCR_ABL1 + (1 | Patient) + (1 | run_id.uniq) |
| Observations | 834 |
| REML | FALSE |
| Optimizer | nloptwrap |
| Singular | FALSE |
| Converged | TRUE |
Adding BCR::ABL fusion status as a fixed effect improves the model fit to the data (tel.lmer2_1; BIC=1.326602^{4}) over the null model (tel.lmer2_0; BIC=1.329903^{4}), confirming a BCR::ABL fusion status effect.
tab_model(
tel.lmer2_0,
tel.lmer2_1,
show.se = T,
show.stat = T,
show.ci = F,
show.p = F,
p.style = "scientific",
dv.labels = c("Baseline (SNV.lmer_0)", "Driver as fixed effect (SNV.lmer2_1)")
)
| Baseline (SNV.lmer_0) | Driver as fixed effect (SNV.lmer2_1) | |||||
|---|---|---|---|---|---|---|
| Predictors | Estimates | std. Error | Statistic | Estimates | std. Error | Statistic |
| (Intercept) | 5103.59 | 470.05 | 10.86 | 5746.17 | 472.78 | 12.15 |
| Age(years) | -14.43 | 8.54 | -1.69 | -21.58 | 8.46 | -2.55 |
| BCR::ABL 1 status: Mt | -627.53 | 97.07 | -6.47 | |||
| Random Effects | ||||||
| σ2 | 426493.29 | 409590.95 | ||||
| τ00 | 333723.54 run_id.uniq | 232305.56 run_id.uniq | ||||
| 26004.99 Patient | 47772.75 Patient | |||||
| ICC | 0.46 | 0.41 | ||||
| N | 9 Patient | 9 Patient | ||||
| 31 run_id.uniq | 31 run_id.uniq | |||||
| Observations | 834 | 834 | ||||
| Marginal R2 / Conditional R2 | 0.044 / 0.481 | 0.185 / 0.516 | ||||
broom::tidy(anova(tel.lmer2_0, tel.lmer2_1)) %>%
mutate(p.value = format.pval(p.value, digits = 3, eps = 1e-50)) %>% tab_df()
| term | npar | AIC | BIC | logLik | deviance | statistic | df | p.value |
|---|---|---|---|---|---|---|---|---|
| tel.lmer2_0 | 5 | 13275.40 | 13299.03 | -6632.70 | 13265.40 | NA | NA | NA |
| tel.lmer2_1 | 6 | 13237.67 | 13266.02 | -6612.83 | 13225.67 | 39.73 | 1 | 2.92e-10 |
We now want to improve the baseline model (tel.lmer2_1), so we test whether “Patient” has an effect on the slope as well as the intercept in 3 models;
tel.lmer2_2 <-
lmer(
Length ~ age_at_sample_exact + BCR_ABL1 + (1 + age_at_sample_exact | Patient) + (1 | run_id.uniq),
data = per_sample_statistics.n834.df,
REML = F
)
## boundary (singular) fit: see help('isSingular')
model.check(tel.lmer2_2)
| Test | Value |
|---|---|
| Formula | Length ~ age_at_sample_exact + BCR_ABL1 + (1 + age_at_sample_exact | Patient) + (1 | run_id.uniq) |
| Observations | 834 |
| REML | FALSE |
| Optimizer | nloptwrap |
| Singular | TRUE |
| Converged | FALSE |
tel.lmer2_3 <-
lmer(
Length ~ age_at_sample_exact + BCR_ABL1 + (1 + BCR_ABL1 | Patient) + (1 | run_id.uniq),
data = per_sample_statistics.n834.df, REML = F
)
model.check(tel.lmer2_3)
| Test | Value |
|---|---|
| Formula | Length ~ age_at_sample_exact + BCR_ABL1 + (1 + BCR_ABL1 | Patient) + (1 | run_id.uniq) |
| Observations | 834 |
| REML | FALSE |
| Optimizer | nloptwrap |
| Singular | FALSE |
| Converged | TRUE |
Model 2 was singular suggesting the random effect structure is too complex for the data and was dropped from the comparison to the baseline model (tel.lmer2_1:
Incorporation of BCR::ABL fusion status into the patient-level random effect improves model fit to the data (tel.lmer2_3; BIC=13265.15) over the baseline model (tel.lmer2_1; BIC=13266.02).
tab_model(
tel.lmer2_1,
tel.lmer2_3,
show.se = T,
show.stat = T,
show.ci = F,
show.p = F,
p.style = "scientific",
dv.labels = c(
"Baseline (tel.lmer2_1)",
"Random driver effect and intercept: (tel.lmer2_3)"
)
)
| Baseline (tel.lmer2_1) | Random driver effect and intercept: (tel.lmer2_3) | |||||
|---|---|---|---|---|---|---|
| Predictors | Estimates | std. Error | Statistic | Estimates | std. Error | Statistic |
| (Intercept) | 5746.17 | 472.78 | 12.15 | 6221.92 | 412.72 | 15.08 |
| Age(years) | -21.58 | 8.46 | -2.55 | -31.57 | 7.56 | -4.17 |
| BCR::ABL 1 status: Mt | -627.53 | 97.07 | -6.47 | -550.88 | 209.88 | -2.62 |
| Random Effects | ||||||
| σ2 | 409590.95 | 399535.37 | ||||
| τ00 | 232305.56 run_id.uniq | 206550.37 run_id.uniq | ||||
| 47772.75 Patient | 12534.25 Patient | |||||
| τ11 | 264196.09 Patient.BCR_ABL1Mt | |||||
| ρ01 | -0.41 Patient | |||||
| ICC | 0.41 | 0.44 | ||||
| N | 9 Patient | 9 Patient | ||||
| 31 run_id.uniq | 31 run_id.uniq | |||||
| Observations | 834 | 834 | ||||
| Marginal R2 / Conditional R2 | 0.185 / 0.516 | 0.236 / 0.572 | ||||
broom::tidy(anova(tel.lmer2_1, tel.lmer2_3)) %>%
mutate(p.value = format.pval(p.value, digits = 3, eps = 1e-50)) %>% tab_df()
| term | npar | AIC | BIC | logLik | deviance | statistic | df | p.value |
|---|---|---|---|---|---|---|---|---|
| tel.lmer2_1 | 6 | 13237.67 | 13266.02 | -6612.83 | 13225.67 | NA | NA | NA |
| tel.lmer2_3 | 8 | 13227.34 | 13265.15 | -6605.67 | 13211.34 | 14.33 | 2 | 0.000774 |
tel.lmer2_3.REML <-
update(
tel.lmer2_3,
REML = T,
control = lmerControl(
optimizer = tel.lmer2_3@optinfo$optimizer,
optCtrl = tel.lmer2_3@optinfo$control
)
)
model.check(tel.lmer2_3.REML)
| Test | Value |
|---|---|
| Formula | Length ~ age_at_sample_exact + BCR_ABL1 + (1 + BCR_ABL1 | Patient) + (1 | run_id.uniq) |
| Observations | 834 |
| REML | TRUE |
| Optimizer | nloptwrap |
| Singular | FALSE |
| Converged | TRUE |
check_model(tel.lmer2_3.REML)
tab_model(
tel.lmer2_3.REML,
show.se = T,
show.stat = T,
show.ci = F,
show.p = F,
p.style = "scientific"
)
| Mean telomere length(bp) | |||
|---|---|---|---|
| Predictors | Estimates | std. Error | Statistic |
| (Intercept) | 6179.88 | 444.18 | 13.91 |
| Age(years) | -30.79 | 8.15 | -3.78 |
| BCR::ABL 1 status: Mt | -556.90 | 225.06 | -2.47 |
| Random Effects | |||
| σ2 | 399685.76 | ||
| τ00 run_id.uniq | 211135.35 | ||
| τ00 Patient | 24643.27 | ||
| τ11 Patient.BCR_ABL1Mt | 311111.95 | ||
| ρ01 Patient | -0.33 | ||
| ICC | 0.46 | ||
| N Patient | 9 | ||
| N run_id.uniq | 31 | ||
| Observations | 834 | ||
| Marginal R2 / Conditional R2 | 0.223 / 0.584 | ||
set.seed(1234)
tel.lmer2_3_par_boot <-
bootstrap(
tel.lmer2_3.REML,
.f = fixef,
type = "parametric",
B = 3000
)
tel.lmer2_3_par_boot
## Bootstrap type: parametric
##
## Number of resamples: 3000
##
## term observed rep.mean se bias
## 1 (Intercept) 6179.87522 6167.39458 485.735226 -12.4806434
## 2 age_at_sample_exact -30.78562 -30.52931 8.951039 0.2563043
## 3 BCR_ABL1Mt -556.89988 -555.87460 227.158700 1.0252743
##
## There were 1370 messages, 0 warnings, and 0 errors.
##
## The most commonly occurring message was: boundary (singular) fit: see help('isSingular')
tel.lmer2_3_par_boot2 <- tel.lmer2_3_par_boot
tel.lmer2_3_par_boot2$replicates <- tel.lmer2_3_par_boot$replicates[1:1000, ]
lmeresampler:::confint.lmeresamp(tel.lmer2_3_par_boot, type = "perc") %>% tab_df()
| term | estimate | lower | upper | type | level |
|---|---|---|---|---|---|
| (Intercept) | 6179.88 | 5207.41 | 7098.60 | perc | 0.95 |
| age_at_sample_exact | -30.79 | -47.83 | -13.24 | perc | 0.95 |
| BCR_ABL1Mt | -556.90 | -988.32 | -103.54 | perc | 0.95 |
keep.lgl.vec <- sapply(tel.lmer2_3_par_boot$message, is.null) & sapply(tel.lmer2_3_par_boot$warning, is.null) & sapply(tel.lmer2_3_par_boot$error, is.null)
tel.lmer2_3_par_boot3 <- tel.lmer2_3_par_boot
tel.lmer2_3_par_boot3$replicates <- tel.lmer2_3_par_boot$replicates[keep.lgl.vec, ][1:1000,]
lmeresampler:::confint.lmeresamp(tel.lmer2_3_par_boot3, type = "perc") %>% tab_df()
| term | estimate | lower | upper | type | level |
|---|---|---|---|---|---|
| (Intercept) | 6179.88 | 5216.22 | 7086.12 | perc | 0.95 |
| age_at_sample_exact | -30.79 | -48.16 | -12.59 | perc | 0.95 |
| BCR_ABL1Mt | -556.90 | -993.16 | -85.98 | perc | 0.95 |
d <-
ggpredict(
model = tel.lmer2_3.REML,
terms = c("age_at_sample_exact [0:90 by=5]", "BCR_ABL1"),
ci.lvl = 0.95,
type = "fe"
)
tel_final <-
d %>% as_tibble() %>%
mutate(group = case_when(group == "Wt" ~ "Absence", group == "Mt" ~ "Presence")) %>%
mutate(`*BCR::ABL1* status` := group) %>%
ggplot(aes(x = x,
y = predicted,
group = `*BCR::ABL1* status`,)) +
geom_line(aes(colour = `*BCR::ABL1* status`)) +
geom_ribbon(aes(ymin = conf.low,
ymax = conf.high,
fill = `*BCR::ABL1* status`),
alpha = 0.2) +
scale_colour_manual(name = "*BCR::ABL1* status", values = driver.palette2b) +
scale_fill_manual(name = "*BCR::ABL1* status", values = driver.palette2b) +
new_scale_fill() +
new_scale_colour() +
geom_quasirandom(
data = per_sample_statistics.n834.df %>%
mutate(
response = Length,
x = age_at_sample_exact,
group = driver2
) %>% mutate(`Driver status` = factor(
driver2,
levels = c("BCR_ABL", "WT", "Other"),
labels = c("*BCR::ABL1*", "Wild-type", "Other")
)) %>% mutate(`*BCR::ABL1* status` := group),
aes(x = x, y = response, colour = `Driver status`),
alpha = 0.5
) +
scale_colour_manual(values = driver.palette) +
# scale_fill_manual(values = driver.palette) +
xlab(get_x_title(d)) +
ylab(get_y_title(d)) +
guides(colour = guide_legend(override.aes = list(linetype = c(rep(
0, length(driver.palette)
))))) +
new_scale_colour() +
geom_abline(
data = as_tibble(
list(
a = 4512.38235,
b = -30.80877,
Reference = "Mitchell et al. -30bp/year"
)
),
aes(
intercept = a,
slope = b,
colour = Reference
),
lty = 2,
show.legend = TRUE
) +
scale_colour_manual(values = "black") +
ggtitle(unlist(str_split(
as.character(tel.lmer2_3.REML@call), pattern = "\\="
))[2]) +
theme_bw(base_size = 16) +
theme(legend.title = element_markdown(), legend.text = element_markdown())
tel_final
Per patient estimated mean telomere length (bp) by age of sampling, annotated by driver status and mixed model trend lines.
Per sample dot plot describing mean telomere length (bp) by sample clonal status for PD51635, faceted by time of sampling and annotated by per sample driver status. Clonal status was classified as samples with either BCR::ABL1, Other driver or coalescence after the first 100 mutations.
tel_length <- sym("Length")
tel_length_label <- attr(per_sample_statistics.n834.df$Length, "label")
Figure_3h.dat <-
per_sample_statistics.n834.df %>%
filter(Patient == "PD51635", !is.na(status)) %>%
mutate(CH = ifelse(status == "NOCH" & driver3 != "WT", "CH", status)) %>%
mutate(
group = paste0(Patient, CH),
Patient = factor(
Patient,
levels = c(
"PD57334",
"PD56961",
"PD57335",
"PD51634",
"PD51633",
"PD51632",
"PD51635",
"PD57333",
"PD57332"
)
),
`*BCR::ABL1* status` = factor(
BCR_ABL1,
levels = c("Wt", "Mt", "Normal"),
labels = c("Absence", "Presence", "Normal")
),
`Driver status` = factor(
driver2,
levels = c("BCR_ABL", "Other", "WT"),
labels = c("*BCR::ABL1*", "Other", "Wild-type")
),
`Clonal status` = factor(
CH,
levels = c("CH", "NOCH"),
labels = c("+", "-")
),
{
{
tel_length_label
}
} := !!tel_length,
`Mutations SNV (adjusted)` = nsub_adj,
`Age (years)` = age_at_sample_exact
)
Figure_3h <-
Figure_3h.dat %>%
ggplot(aes(
`Clonal status`,
!!sym(tel_length_label),
colour = `Driver status`,
group.by = group,
)) +
geom_quasirandom(width = 0.4, alpha = 0.5) +
scale_colour_manual(values = driver.palette) +
theme_bw(base_size = 16) +
theme(legend.text = element_markdown()) +
facet_nested(~ Patient + `Age (years)`, scales = "free_x") +
force_panelsizes(cols = c(1, 2)) + theme(panel.spacing = unit(0, "lines"))
Figure_3h
ggsave(filename = paste0("../figures/v1/", "figure_3h_V1.pdf"), plot = Figure_3h, width = 4.5, height =4.5 )
sessionInfo()
## R version 4.1.3 (2022-03-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS 15.2
##
## Matrix products: default
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] parameters_0.21.1 performance_0.10.4 sjPlot_2.8.14 lmeresampler_0.2.4 dfoptim_2023.1.0 optimx_2023-8.13 lme4_1.1-33 Matrix_1.5-1
## [9] patchwork_1.2.0 scales_1.3.0 ggh4x_0.3.0 ggnewscale_0.5.0 ggeffects_1.2.2 ggtext_0.1.2 ggbeeswarm_0.7.2 magrittr_2.0.3
## [17] bookdown_0.33 magick_2.7.4 rvest_1.0.3 lubridate_1.9.2 forcats_1.0.0 stringr_1.5.1 dplyr_1.1.2 purrr_1.0.1
## [25] readr_2.1.4 tidyr_1.3.0 tibble_3.2.1 ggplot2_3.5.1 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] backports_1.4.1 HLMdiag_0.5.0 Hmisc_5.1-0 systemfonts_1.0.4 plyr_1.8.8 splines_4.1.3 TH.data_1.1-2
## [8] digest_0.6.31 import_1.3.0 htmltools_0.5.5 fansi_1.0.4 checkmate_2.2.0 cluster_2.1.2 see_0.8.0
## [15] tzdb_0.3.0 modelr_0.1.10 vroom_1.6.1 sandwich_3.0-2 timechange_0.2.0 colorspace_2.1-0 ggdist_3.2.1
## [22] textshaping_0.3.6 haven_2.5.2 xfun_0.39 crayon_1.5.2 jsonlite_1.8.8 survival_3.2-13 zoo_1.8-12
## [29] glue_1.6.2 gtable_0.3.4 emmeans_1.8.7 sjstats_0.18.2 sjmisc_2.8.9 distributional_0.3.2 mvtnorm_1.1-3
## [36] Rcpp_1.0.10 viridisLite_0.4.2 xtable_1.8-4 gridtext_0.1.5 htmlTable_2.4.2 foreign_0.8-82 bit_4.0.5
## [43] Formula_1.2-5 datawizard_0.7.1 htmlwidgets_1.6.2 httr_1.4.5 ellipsis_0.3.2 pkgconfig_2.0.3 farver_2.1.1
## [50] nnet_7.3-17 sass_0.4.7 utf8_1.2.3 janitor_2.2.0 tidyselect_1.2.0 labeling_0.4.3 rlang_1.1.1
## [57] reshape2_1.4.4 effectsize_0.8.3 munsell_0.5.0 nlmeU_0.70-9 tools_4.1.3 cachem_1.0.8 cli_3.6.1
## [64] generics_0.1.3 sjlabelled_1.2.0 broom_1.0.4 evaluate_0.23 fastmap_1.1.1 ragg_1.2.5 yaml_2.3.7
## [71] knitr_1.45 bit64_4.0.5 nlme_3.1-155 pracma_2.4.2 xml2_1.3.3 compiler_4.1.3 rstudioapi_0.14
## [78] beeswarm_0.4.0 statmod_1.5.0 bslib_0.4.2 stringi_1.8.2 highr_0.10 lattice_0.21-8 commonmark_1.9.0
## [85] diagonals_6.4.0 nloptr_2.0.3 markdown_1.11 vctrs_0.6.2 pillar_1.9.0 lifecycle_1.0.4 jquerylib_0.1.4
## [92] estimability_1.4.1 data.table_1.14.8 insight_0.19.2 R6_2.5.1 gridExtra_2.3 vipor_0.4.5 codetools_0.2-18
## [99] boot_1.3-28 MASS_7.3-60 withr_2.5.2 multcomp_1.4-25 mgcv_1.8-39 bayestestR_0.13.1 parallel_4.1.3
## [106] hms_1.1.2 grid_4.1.3 rpart_4.1.16 coda_0.19-4 minqa_1.2.5 rmarkdown_2.25 snakecase_0.11.0
## [113] numDeriv_2016.8-1.1 base64enc_0.1-3